In [2]:
%matplotlib qt4
from __future__ import division

from models import tools, filters

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
from collections import defaultdict

# Use seaborn's "ticks" theme and keep a visible frame around legends.
sns.set_style("ticks", {"legend.frameon": True})
# Default colour sequence for plotted series (first entry is the blue reused
# for the box plot further down).
# NOTE(review): 'axes.color_cycle' is the old matplotlib key; newer releases
# expect 'axes.prop_cycle' - confirm against the installed version.
mpl.rcParams['axes.color_cycle'] = ['#02A5F4', 'orange', 'green']

In [30]:
# Read a fixed window of the answer log through the project loader.
data = tools.load_data(offset=100000, limit=3000000)
# Keep only answers whose response_time is below 20000 (the plots in this
# notebook label this column as milliseconds, i.e. a 20 s cut-off).
data = data.loc[data['response_time'] < 20000]


Loaded 2992247 answers.
/home/pavel/.pyenv/al/local/lib/python2.7/site-packages/pandas/io/parsers.py:1159: DtypeWarning: Columns (10) have mixed types. Specify dtype option on import or set low_memory=False.
  data = self._reader.read(nrows)

By Country in a Box Plot


In [3]:
# Restrict the answers with the project's country filter.
# NOTE(review): presumably filters.countries returns a boolean mask (or row
# selector) aligned with `data` - confirm in models.filters.
country_rows = filters.countries(data)
data = data[country_rows]

In [4]:
# Split the observed response-time range into 20 equal-width bins. Only the
# edges are kept; pd.cut builds right-closed intervals (lower, upper] and
# places the first edge slightly below the minimum so every answer fits.
_, bins = pd.cut(data['response_time'], bins=20, retbins=True)
# Materialise the (lower, upper) pairs: later cells iterate them again, and a
# bare zip() is a one-shot iterator on Python 3.
intervals = list(zip(bins[:-1], bins[1:]))

# responses[place] -> one success rate per interval, in interval order
# (NaN where the place has no answers inside that interval).
responses = defaultdict(list)
places = data['place_id'].unique()
response_time = data['response_time']
for lower_bound, upper_bound in intervals:
    tools.echo('{}-{}'.format(lower_bound, upper_bound))
    # Right-closed comparison, matching pd.cut and the '(a, b]' tick labels.
    # A left-closed [a, b) test would silently drop the slowest answers
    # (response_time == bins[-1]) from the last interval.
    in_interval = data[(response_time > lower_bound) &
                       (response_time <= upper_bound)]
    # Select the interval once and split it by place, instead of re-scanning
    # the whole frame for every (interval, place) pair.
    success_by_place = {
        place: answers['is_correct'].mean()
        for place, answers in in_interval.groupby('place_id')
    }
    for place in places:
        responses[place].append(success_by_place.get(place, float('nan')))


14249.85-14999.0

In [5]:
# X[k] collects, across all places, the finite success rates observed in the
# k-th response-time interval; NaN entries (no answers) are left out.
X = [
    [rates[k] for rates in responses.values() if np.isfinite(rates[k])]
    for k in range(len(intervals))
]

In [7]:
# One tick label per response-time interval, with the edges truncated to ints.
labels = ['({}, {}]'.format(int(i), int(j)) for i, j in intervals]

plt.figure(num=None, figsize=(9, 6), dpi=120)
plt.xticks(rotation=70)
# One box per interval, drawn from the per-place success rates in X;
# outliers are not drawn.
bp = plt.boxplot(X, labels=labels, showfliers=False)
# Axis label spelling fixed ("milliseconds").
plt.xlabel('Response time in milliseconds')
plt.ylabel('Probability of recall')
# Leave room below the axes for the rotated tick labels.
plt.subplots_adjust(bottom=0.25)

# Recolour the box-plot artists to match the notebook palette.
plt.setp(bp['medians'], color='orange')
plt.setp(bp['boxes'], color='#02A5F4')
plt.setp(bp['whiskers'], color='#02A5F4')
plt.setp(bp['fliers'], color='#02A5F4', marker='+')

plt.tight_layout()

Success vs Response Time


In [31]:
# Map each answer's row label to the correctness of the same user's previous
# answer for the same place. A user's first answer for a place has no
# predecessor and therefore gets no entry.
previous_is_correct = {}
groups = data.groupby(['user_id', 'place_id'])

for i, (_, group) in enumerate(groups):
    # NOTE(review): assumes ascending row labels are chronological order -
    # confirm against how tools.load_data orders its rows.
    outcomes = group['is_correct'].sort_index()
    # Pair every answer with its predecessor straight from the sorted column.
    # This replaces the per-row `.ix` lookup, which is deprecated (removed in
    # pandas 1.0) and built a full row object for each answer.
    for idx, previous in zip(outcomes.index[1:], outcomes.values[:-1]):
        previous_is_correct[idx] = previous
    if i % 10000 == 0:
        tools.echo(i)


1130000

In [32]:
# Work on a copy of the two needed columns so that `data` itself is not
# modified through an alias (which also triggers SettingWithCopyWarning on a
# filtered frame).
d1 = data[['is_correct', 'response_time']].copy()
# Bucket answers into half-second steps expressed in seconds (0.0, 0.5, 1.0,
# ...). A bare `// 500` yields half-second bin *numbers* (0..39), which does
# not match the "Response time in seconds" axes and the 0..20 ticks used by
# the plots below. response_time is labelled as milliseconds in this notebook.
d1['response_bin'] = (d1['response_time'] // 500) / 2.0
d1 = d1[['is_correct', 'response_bin']]

In [33]:
# Turn the {row label: previous outcome} mapping into a one-column frame
# indexed by that row label, ready to be aligned with d1.
d2 = pd.DataFrame(
    list(previous_is_correct.items()),
    columns=['id', 'previous_correct']
).set_index('id')

In [34]:
# Put d1 and d2 side by side, aligned on their row labels. join='inner' keeps
# only labels present in both frames, i.e. answers for which a previous
# answer was recorded in d2.
d = pd.concat([d1, d2], axis=1, join='inner')

In [35]:
# Partition the joined answers by the outcome of the preceding answer.
previous_outcome = d['previous_correct']
prev_correct = d[previous_outcome == 1]
prev_incorrect = d[previous_outcome == 0]

In [88]:
def grouping(df):
    """Return the share of correct answers per response-time bin.

    Only the ``is_correct`` and ``response_bin`` columns of ``df`` are used.
    The result is a DataFrame indexed by ``response_bin`` whose single
    ``is_correct`` column holds sum / count for that bin (missing values are
    ignored by both the sum and the count).
    """
    per_bin = df[['is_correct', 'response_bin']].groupby(['response_bin'])
    totals = per_bin.sum()
    counts = per_bin.count()
    return totals / counts

In [109]:
# Success rate per response-time bin, one line per previous outcome.
plt.figure(num=None, figsize=(5, 4), dpi=120)
for frame, caption in ((prev_correct, 'previous correct'),
                       (prev_incorrect, 'previous incorrect')):
    plt.plot(grouping(frame), '.-', label=caption)
plt.xlabel('Response time in seconds')
plt.ylabel('Success')
# Legend in the lower-right corner with a thin frame line.
legend = plt.legend(loc='lower right', prop={'size': 12})
legend.get_frame().set_linewidth(1)
plt.xticks(range(0, 21, 2))
plt.tight_layout()

In [103]:
# Number of answers per response-time bin (log scale), side by side for the
# two previous-outcome groups.
plt.figure(num=None, figsize=(5, 4), dpi=120)
hist_groups = [('previous correct', prev_correct),
               ('previous incorrect', prev_incorrect)]
plt.hist([list(frame['response_bin']) for caption, frame in hist_groups],
         bins=20, rwidth=0.8,
         label=[caption for caption, frame in hist_groups])
plt.yscale('log')
plt.xlabel('Response time in seconds')
plt.ylabel('Number of answers')
plt.xticks(range(0, 21, 2))
legend = plt.legend(prop={'size': 12})
legend.get_frame().set_linewidth(1)
plt.tight_layout()

In [108]:
# Dump the per-bin success rates as plain dicts, one section per group with a
# blank line between sections. Single-argument print(...) gives the same
# output under the Python 2 print statement and the Python 3 function.
sections = [('Previous correct:', prev_correct),
            ('Previous incorrect:', prev_incorrect)]
for position, (heading, frame) in enumerate(sections):
    if position:
        print('')
    print(heading)
    print(grouping(frame).to_dict())


Previous correct:
{'is_correct': {0.0: 0.16241560390097523, 0.5: 0.45839753466872113, 2.0: 0.94703097610709874, 3.0: 0.91035782747603833, 4.0: 0.87432298780200635, 5.0: 0.84153061017018072, 2.5: 0.92948168058256131, 1.0: 0.93804279268351853, 8.0: 0.78319399178121019, 9.0: 0.77434936908517349, 10.0: 0.75907505490246741, 11.0: 0.75359307359307359, 12.0: 0.74181498571742477, 13.0: 0.74379940038157533, 14.0: 0.72151474530831095, 15.0: 0.74357961280126428, 16.0: 0.72674418604651159, 4.5: 0.85716086737266772, 18.0: 0.73603723404255317, 19.0: 0.74449685534591192, 11.5: 0.74628879892037792, 12.5: 0.73995157384987897, 5.5: 0.83011942304709718, 13.5: 0.74624530663329158, 8.5: 0.77624193683505072, 9.5: 0.76480736055204135, 14.5: 0.7488064634594197, 15.5: 0.74769230769230766, 6.0: 0.81813484887625931, 17.5: 0.73011015911872701, 7.0: 0.7984419682477073, 1.5: 0.95841366283032459, 16.5: 0.7407605784681307, 19.5: 0.72490118577075102, 3.5: 0.89028765933554987, 10.5: 0.76096962258361456, 18.5: 0.72233820459290188, 17.0: 0.72171428571428575, 6.5: 0.80792497259550977, 7.5: 0.79116537978093016}}

Previous incorrect:
{'is_correct': {0.0: 0.018337408312958436, 0.5: 0.059914407988587728, 2.0: 0.75385315959086452, 3.0: 0.7438426626323752, 4.0: 0.71575356541038182, 5.0: 0.68165565723289501, 2.5: 0.75589375448671936, 1.0: 0.54084675014907568, 8.0: 0.63706422018348619, 9.0: 0.62819176654507558, 10.0: 0.60323886639676116, 11.0: 0.59540954095409537, 12.0: 0.60588901472253676, 13.0: 0.6216216216216216, 14.0: 0.57520510483135823, 15.0: 0.60812294182217341, 16.0: 0.57517241379310347, 4.5: 0.70247483013440459, 18.0: 0.58429118773946365, 19.0: 0.57202505219206679, 11.5: 0.61020408163265305, 12.5: 0.59514435695538059, 5.5: 0.67432412468606884, 13.5: 0.60971659919028343, 8.5: 0.63550373955125383, 9.5: 0.60639534883720925, 14.5: 0.56538839724680434, 15.5: 0.59576968272620445, 6.0: 0.66985985453255281, 17.5: 0.59008264462809923, 7.0: 0.65492404400209536, 1.5: 0.74372831598612221, 16.5: 0.6076487252124646, 19.5: 0.59267734553775742, 3.5: 0.72282980177717016, 10.5: 0.60840534171249017, 18.5: 0.58648111332007957, 17.0: 0.61654135338345861, 6.5: 0.65019094380796505, 7.5: 0.63664888609978032}}

In [ ]: